#  Clear memory
rm(list=ls())

# Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidyverse, readxl, data.table, stringr, collapse, modelsummary, knitr, kableExtra, haven, usmap, tigris, ggpubr, fixest, housingData
)

# Clean and combine all QCEW data 
qcew_data_function = function(selected_year) {
  file_path = paste0("raw/qcew/allhlcn", sprintf('%02d', selected_year %% 100), ".xlsx")
  
  temp_file = read_excel(file_path) %>%
    filter(Industry == "Manufacturing") %>%
    filter(Cnty != 000) %>%
    filter(Cnty != "000") %>%
    mutate(average_pay_per_firm = `Annual Total Wages`/`Annual Average Establishment Count` ) %>%
    rename(avg_number_of_firms = `Annual Average Establishment Count`) %>%
    rename(annual_total_wages = `Annual Total Wages`) %>%
    rename(avg_employment = `Annual Average Employment`) %>%
    select(c("St", "Cnty", "average_pay_per_firm", "avg_number_of_firms", "annual_total_wages", "avg_employment")) %>%
    mutate(St = as.numeric(St)) %>%
    mutate(census_division = case_when(St %in% c(9, 23, 25, 33, 44, 50) ~ 1,
                                       St %in% c(34, 36, 42) ~ 2,
                                       St %in% c(17, 18, 26, 39, 55) ~ 3,
                                       St %in% c(19, 20, 27, 29, 31, 38, 46) ~ 4,
                                       St %in% c(10, 11, 12, 13, 24, 37, 45, 51, 54) ~ 5,
                                       St %in% c(1, 21, 28, 47) ~ 6,
                                       St %in% c(5, 22, 40, 48) ~ 7,
                                       St %in% c(4, 8, 16, 30, 32, 35, 49, 56) ~ 8,
                                       St %in% c(2, 6, 15, 41, 53) ~ 9,
                                       TRUE ~ 0)) %>%
    mutate(census_region = case_when(St %in% c(9, 23, 25, 33, 44, 50) ~ 1,
                                     St %in% c(34, 36, 42) ~ 1,
                                     St %in% c(17, 18, 26, 39, 55) ~ 2,
                                     St %in% c(19, 20, 27, 29, 31, 38, 46) ~ 2,
                                     St %in% c(10, 11, 12, 13, 24, 37, 45, 51, 54) ~ 3,
                                     St %in% c(1, 21, 28, 47) ~ 3,
                                     St %in% c(5, 22, 40, 48) ~ 3,
                                     St %in% c(4, 8, 16, 30, 32, 35, 49, 56) ~ 4,
                                     St %in% c(2, 6, 15, 41, 53) ~ 4,
                                     TRUE ~ 0)) %>%
    rename(state = St) %>%
    mutate(Cnty = as.numeric(Cnty)) %>%
    rename(county = Cnty) %>%
    mutate(year = selected_year)
  return(temp_file)
}

qcew1990 = qcew_data_function(selected_year = 1990)
qcew1996 = qcew_data_function(selected_year = 1996)
qcew1997 = qcew_data_function(selected_year = 1997)
qcew1998 = qcew_data_function(selected_year = 1998)
qcew1999 = qcew_data_function(selected_year = 1999)
qcew2000 = qcew_data_function(selected_year = 2000)
qcew2001 = qcew_data_function(selected_year = 2001)

qcew = rbind(qcew1990,
             qcew1996, 
             qcew1997, 
             qcew1998, 
             qcew1999, 
             qcew2000, 
             qcew2001)
qcew = qcew %>% 
  mutate(across(everything(), .fns = ~replace_na(.,0)))

# SIC --> NAICS xWalk 

sic87_to_naics12 <- fread("raw/CBP/industry_crosswalk/efsy_sic87_to_naics12.csv") %>%
  rename(SIC = sic87) %>%
  mutate(SIC = str_remove_all(SIC, "-"))

sic87_to_naics12$SIC = str_replace(sic87_to_naics12$SIC,"\\\\","")

sic87_to_naics12 = sic87_to_naics12 %>%
  mutate(SIC = as.integer(SIC))



# Make NEI function
nei_data_function = function(selected_year) {
  file_to_import = paste0("raw/nei/", selected_year, "SicSummarymade09082005.txt")
  temp_data = fread(file_to_import)    %>%
    full_join(sic87_to_naics12) %>%
    mutate(first_two_naics_digit = as.integer(substr(as.character(naics12), 1, 2))) %>%
    filter(first_two_naics_digit >= 31 & first_two_naics_digit <= 33) %>%
    mutate(weighted_pollution = weight*ANNUAL_EMISSION) %>%
    rename(state = STATE_FIPS, county = COUNTY_FIPS) %>%
    collap( 
      by = ~ state + county +POLLUTANT_CODE, 
      custom = list(fsum = c("weighted_pollution"))) %>% 
    filter(POLLUTANT_CODE != "PM25-FIL") %>%
    filter(POLLUTANT_CODE != "PM-PRI") %>%
    filter(POLLUTANT_CODE != "PM-FIL") %>%
    filter(POLLUTANT_CODE != "PM10-FIL") %>%
    filter(POLLUTANT_CODE != "PM-CON") %>%
    mutate(POLLUTANT_CODE = 
             case_when(POLLUTANT_CODE %in% c("PM25-PRI") ~ "PM25", 
                       POLLUTANT_CODE %in% c("PM10-PRI") ~ "PM10",
                       TRUE ~ POLLUTANT_CODE)) %>%
    pivot_wider(id_cols = c(state, county), names_from = POLLUTANT_CODE, values_from = weighted_pollution) %>%
    mutate(year = selected_year)
  return(temp_data)
}

nei1990 = nei_data_function(selected_year = 1990)
nei1996 = nei_data_function(selected_year = 1996)
nei1997 = nei_data_function(selected_year = 1997)
nei1998 = nei_data_function(selected_year = 1998)
nei1999 = nei_data_function(selected_year = 1999)
nei2000 = nei_data_function(selected_year = 2000)
nei2001 = nei_data_function(selected_year = 2001)


nei = rbind(nei1990,
            nei1996, 
            nei1997, 
            nei1998, 
            nei1999, 
            nei2000, 
            nei2001)
nei = nei %>% 
  mutate(across(everything(), .fns = ~replace_na(.,0)))


# Merge the two together 
combined = qcew %>% 
  filter(avg_employment !=0) %>%
  inner_join(nei) %>% 
  mutate(SO2_intensity = (SO2/avg_employment)*1) %>%
  mutate(VOC_intensity = (VOC/avg_employment)*1) %>%
  mutate(CO_intensity = (CO/avg_employment)*1) %>%
  mutate(NH3_intensity = (NH3/avg_employment)*1) %>%
  mutate(NOX_intensity = (NOX/avg_employment)*1) %>%
  mutate(PM10_intensity = (PM10/avg_employment)*1) %>%
  mutate(PM25_intensity = (PM25/avg_employment)*1) %>%
  mutate(str_st = str_pad(state, 2, "left", "0")) %>%
  mutate(str_cty = str_pad(county, 3, "left", "0")) %>%
  mutate(fips = paste0(str_st, str_cty)) %>%
  mutate(fips = as.numeric(fips)) %>%
  select(-c("str_st", "str_cty"))    

# Import non-attainment status
nonattainment <- fread("data/productivity_panel.csv") %>%
  filter(industry == "Manufacturing") 


# Create panel

pollutant_per_worker_panel = nonattainment %>%
  right_join(combined) %>%
  rename(SO2_amount = SO2) %>%
  rename(CO_amount = CO) %>%
  rename(NOX_amount = NOX) %>%
  rename(NH3_amount = NH3) %>%
  rename(PM10_amount = PM10) %>%
  rename(PM25_amount = PM25) %>%
  rename(VOC_amount = VOC) %>%
  pivot_longer(names_to = c("pollutant", "measure"), 
               cols = c(ends_with("_intensity"), ends_with("_amount")), 
               names_sep = "_",
               values_to = "value") %>%
  mutate(wages_per_worker = (annual_total_wages/avg_employment)*10000) %>% 
  mutate(annual_total_wages = annual_total_wages/1000000000) %>%
  pivot_wider(names_from = "measure" ) %>%
  group_by(fips) %>%
  mutate(new_first_non_1990 = mean(first_non_1990, na.rm = TRUE) ) %>%
  ungroup() %>%
  mutate(new_first_non_1990 = ifelse(is.nan(new_first_non_1990), 99999, new_first_non_1990)) %>%
  mutate(new_post_nonattainment = ifelse(year > new_first_non_1990, 1, 0)) %>%
  mutate(ihs_intensity = log(intensity  + sqrt(intensity  ^ 2 + 1))) %>%
  mutate(ihs_amount = log(amount  + sqrt(amount  ^ 2 + 1))) %>%
  mutate(ln_annual_total_wages = log(annual_total_wages))

fwrite(pollutant_per_worker_panel, file = "data/emissions.csv")
